# Sample outliers can be detected from the distribution of read counts across samples, visualised as a box-and-whisker plot of log2 counts per million (CPM) for each sample. CPM corrects for differences in library size (i.e. sequencing depth per sample); the values are log-transformed because count data are not normally distributed. We use the cpm function in edgeR to get log2(CPM), which also adds a small offset to avoid taking the log of zero.
# log2 counts per million for every gene in every sample
logcounts <- cpm(edgeRData, log = TRUE)
# One box per sample; las = 2 draws the axis labels perpendicular to the axes
boxplot(
  logcounts,
  xlab = "",
  ylab = "Log2 counts per million",
  las = 2
)
# Blue reference line at the median of all log2-CPM values
abline(h = median(logcounts), col = "blue")
# The x-axis title is pushed down (line = 5) to clear the rotated labels
title(xlab = "Sample", line = 5, cex.lab = 1.1)
title("Boxplots of log-2 CPMs after normalisation")
# Chunk 15
# Design matrix without an intercept: one column per level of `group`,
# with the columns renamed to the bare group labels.
design <- model.matrix(~ 0 + group, data = edgeRData$samples)
colnames(design) <- levels(edgeRData$samples$group)
print(design)
# Contrast matrix: each treated line minus the RH30_P group
my.contrasts <- makeContrasts(
  RH30_P_vs_RH_HD  = RH_HD - RH30_P,
  RH30_P_vs_RH_C7  = RH_C7 - RH30_P,
  RH30_P_vs_RH_C13 = RH_C13 - RH30_P,
  levels = design
)
# Estimate common, trended and tagwise dispersions (robust = TRUE)
edgeRData <- estimateDisp(edgeRData, design, robust = TRUE)
# Note the common dispersion and the corresponding BCV, which is the square
# root of the dispersion.
edgeRData$common.dispersion
sqrt(edgeRData$common.dispersion)
# Chunk 16
# BCV against average log-CPM, first with customised colours and symbols ...
plotBCV(
  edgeRData,
  xlab = "Average log CPM",
  ylab = "Biological coefficient of variation",
  pch = 14,
  cex = 0.2,
  col.common = "red",
  col.trend = "blue",
  col.tagwise = "black"
)
# ... then with the default appearance
plotBCV(edgeRData)
# Chunk 17
# Quasi-likelihood GLM fit shared by all contrasts
fit <- glmQLFit(edgeRData, design)

# Run the QL F-test for one column of my.contrasts and attach a
# Benjamini-Hochberg FDR column to the result table.
test_contrast <- function(contrast_name) {
  res <- glmQLFTest(fit, contrast = my.contrasts[, contrast_name])
  res$table$FDR <- p.adjust(res$table$PValue, method = "BH")
  res
}

RH30_P_vs_RH_HD  <- test_contrast("RH30_P_vs_RH_HD")
RH30_P_vs_RH_C7  <- test_contrast("RH30_P_vs_RH_C7")
RH30_P_vs_RH_C13 <- test_contrast("RH30_P_vs_RH_C13")
# Chunk 18
# For every comparison: print a summary of the DE calls and the top genes,
# draw a smear plot, and save a labelled volcano plot to
# "Volcano_plot_<comparison>.png" in the working directory.
dge_files <- list(
  RH30_P_vs_RH_HD  = RH30_P_vs_RH_HD,
  RH30_P_vs_RH_C7  = RH30_P_vs_RH_C7,
  RH30_P_vs_RH_C13 = RH30_P_vs_RH_C13
)
filenames <- names(dge_files)

# Point colours of the volcano plot, keyed by expression class
cols <- c("Up-regulated" = "#ffad73", "Down-regulated" = "#26b3ff", "Unchanged" = "grey")
# Number of genes labelled per direction
top <- 10

for (i in seq_along(dge_files)) {
  j <- filenames[i]
  d <- dge_files[[i]]

  # Number of up/down significant genes at FDR = 0.05 with lfc = 1
  de <- decideTestsDGE(d, adjust.method = "BH", p.value = 0.05, lfc = 1)
  dge_sum <- summary(de)
  print(dge_sum)

  # Print top genes
  print(topTags(d, 10, adjust.method = "BH", p.value = 0.05, sort.by = "PValue"))

  # Smear plot with the DE genes highlighted
  detags <- rownames(edgeRData)[as.logical(de)]
  plotSmear(d, de.tags = detags, cex = 1, xlab = "Average logCPM", ylab = "logFC")

  # Per-gene table with BH-adjusted p-values. The gene name is stored in an
  # explicit column so the labels below do not depend on row names surviving
  # the dplyr verbs.
  data <- d$table
  data$FDR <- p.adjust(data$PValue, method = "BH")
  data$gene <- rownames(data)
  data <- data %>%
    mutate(
      Expression = case_when(
        logFC > 1 & FDR < 0.05 ~ "Up-regulated",
        logFC < -1 & FDR < 0.05 ~ "Down-regulated",
        TRUE ~ "Unchanged"
      )
    )
  # Nothing auto-prints inside a for loop, so print() is explicit
  print(kable(head(data)))

  # Optional exports:
  # write.csv(data, file = paste0("unfiltered_DEGs_", j, ".csv"))
  # write.csv(data[data$Expression != "Unchanged", ], file = paste0("DEGs_LFC_1_", j, ".csv"))

  ############## Volcano Plots ################
  p3 <- ggplot(data, aes(x = logFC, y = -log10(FDR))) +
    geom_hline(yintercept = -log10(0.05), linetype = "dashed") +
    geom_vline(xintercept = c(-1, 1), linetype = "dashed") +
    scale_colour_manual(values = cols) +
    ggtitle(j) +
    geom_point(aes(colour = Expression), alpha = 0.8, shape = 16, size = 1)

  # Genes to label: the `top` up- and down-regulated genes, ordered by FDR
  # and then by decreasing absolute log fold change
  top_genes <- bind_rows(
    data %>%
      filter(Expression == "Up-regulated") %>%
      arrange(FDR, desc(abs(logFC))) %>%
      head(top),
    data %>%
      filter(Expression == "Down-regulated") %>%
      arrange(FDR, desc(abs(logFC))) %>%
      head(top)
  )
  print(kable(top_genes))

  p3 <- p3 +
    geom_label_repel(
      data = top_genes,
      mapping = aes(x = logFC, y = -log10(FDR), label = gene),
      label.size = 0.5,
      force = 0.5,
      max.overlaps = 15,
      size = 4.5
    )

  # filename/plot are named explicitly; ggsave(p3, file = ...) only worked
  # through partial matching of `file` to `filename`
  ggsave(
    filename = paste0("Volcano_plot_", j, ".png"),
    plot = p3,
    width = 25, height = 25, units = "cm"
  )
}
# Chunk 19
# Stacked bar chart of DEG counts per treatment, read from a spreadsheet with
# the columns Treatment, Value and Direction.
df <- read_xlsx("C:/Users/Christina/Documents/RMS RNA seq/Zoes data/No_DEGs.xlsx")
# Stacked
p <- ggplot(df, aes(x = Treatment, y = Value, fill = Direction)) +
  geom_bar(stat = "identity", position = "stack") +
  ylab("Number of genes") +
  ggtitle("Number of DEGs intrinsic resistance") +
  theme(
    axis.text = element_text(size = 14),
    axis.title = element_text(size = 14)
  ) +
  scale_fill_manual(values = c("deepskyblue1", "indianred1"))
ggsave(filename = "No_DEGs_Zoes.png", plot = p, width = 25, height = 25, units = "cm")
# Chunk 20
# Box plot of the CPM of a single gene across the sample groups.
gene_of_interest <- "DDX1"

# NOTE(review): cpm() on the bare count matrix does not see
# edgeRData$samples$norm.factors; use cpm(edgeRData) if normalised CPMs are
# intended -- confirm.
myCPM <- cpm(edgeRData$counts)
x <- myCPM[gene_of_interest, ]
m <- tibble(
  counts = as.numeric(x),
  group = as.factor(edgeRData$samples$group)
)

# Put the groups used in the contrasts first (RH30_P as reference) and keep
# any other level after them. The previous hard-coded levels (DMSO, RA, TAZ,
# ...) matched none of these groups and turned every group into NA.
preferred_order <- c("RH30_P", "RH_HD", "RH_C7", "RH_C13")
group_levels <- c(
  intersect(preferred_order, levels(m$group)),
  setdiff(levels(m$group), preferred_order)
)
m$group <- factor(m$group, levels = group_levels)

q <- ggplot(m, aes(group, counts)) +
  geom_boxplot() +
  geom_jitter(width = 0.1)
# The title follows the plotted gene (it previously read "RARB" while the
# data were DDX1)
q <- q + labs(x = "", y = "Normalized Counts ", title = gene_of_interest)
q <- q + theme(
  axis.text.x = element_text(angle = 90, vjust = 1, hjust = 1, size = 15),
  axis.title.y = element_text(size = 15)
)
print(q)
# Chunk 21
# Pairwise comparison of log fold changes between the three contrasts.

# Recompute the BH FDR of a test result and add an `Expression` column to its
# table: "significant" when |logFC| > lfc_cutoff and FDR < fdr_cutoff,
# otherwise "Unchanged". Returns the modified result object.
flag_significant <- function(res, lfc_cutoff = 1, fdr_cutoff = 0.05) {
  res$table$FDR <- p.adjust(res$table$PValue, method = "BH")
  res$table <- res$table %>%
    mutate(
      Expression = case_when(
        abs(logFC) > lfc_cutoff & FDR < fdr_cutoff ~ "significant",
        TRUE ~ "Unchanged"
      )
    )
  res
}

# Merge the tables of two flagged results by gene (x_res columns get the
# suffix .x, y_res columns .y) and add `sig`, which records in which of the
# two comparisons a gene is significant.
merge_comparisons <- function(x_res, y_res, x_label, y_label) {
  merged <- merge(x_res$table, y_res$table, by = "row.names", all = TRUE)
  merged %>%
    mutate(
      sig = case_when(
        Expression.x == "significant" & Expression.y == "significant" ~ "Both",
        Expression.x == "significant" ~ x_label,
        Expression.y == "significant" ~ y_label,
        TRUE ~ "Unchanged"
      )
    )
}

######### for C13 and C7 #######################
RH30_P_vs_RH_C13 <- flag_significant(RH30_P_vs_RH_C13)
RH30_P_vs_RH_C7 <- flag_significant(RH30_P_vs_RH_C7)
# x = C7, y = C13
df <- merge_comparisons(RH30_P_vs_RH_C7, RH30_P_vs_RH_C13, "C7", "C13")

# Colours are named so each class keeps its colour even if a class is absent.
# Axis labels follow the merge order (x = C7, y = C13); they were swapped.
scatterplot <- ggplot(df, aes(x = logFC.x, y = logFC.y, color = sig)) +
  geom_point() +
  geom_smooth(method = lm, color = "blue", fill = "#69b3a2", se = TRUE) +
  scale_color_manual(
    values = c(
      "Both" = "mediumorchid1",
      "C13" = "steelblue1",
      "C7" = "hotpink1",
      "Unchanged" = "grey"
    )
  ) +
  ggtitle("Differentially expressed genes for C7 vs P and C13 vs P") +
  xlab("Log-2 fold change C7 vs P") +
  ylab("Log-2 fold change C13 vs P") +
  theme(
    axis.text = element_text(size = 12),
    axis.title = element_text(size = 12)
  ) +
  guides(color = guide_legend(title = "Treatment differentially expressed in"))
# `scatterplot` is only stored; saving is currently disabled:
# ggsave(filename = "Scatterplot_TAZ_iso_PT10.png", plot = scatterplot, width = 25, height = 25, units = "cm")

# Same data without colouring, with regression equation and correlation
ggplot(df, aes(x = logFC.x, y = logFC.y)) +
  geom_point() +
  geom_smooth(method = lm, color = "red", fill = "#69b3a2", se = TRUE) +
  stat_regline_equation(label.y = 10) +
  stat_cor(label.y = 6)

########## for HD and C13 ###########
# NOTE(review): HD is flagged with |logFC| > 1.5 while C7 and C13 use 1 --
# confirm the stricter cut-off is intended.
RH30_P_vs_RH_HD <- flag_significant(RH30_P_vs_RH_HD, lfc_cutoff = 1.5)
# x = HD, y = C13
df <- merge_comparisons(RH30_P_vs_RH_HD, RH30_P_vs_RH_C13, "HD", "C13")

ggplot(df, aes(x = logFC.x, y = logFC.y)) +
  geom_point() +
  geom_smooth(method = lm, color = "red", fill = "#69b3a2", se = TRUE)

ggplot(df, aes(x = logFC.x, y = logFC.y, color = sig)) +
  geom_point() +
  geom_smooth(method = lm, color = "red", fill = "#69b3a2", se = TRUE) +
  ggtitle("RH30 C13 and RH30 HD")

############## for C7 and HD #####################
# x = HD, y = C7
df <- merge_comparisons(RH30_P_vs_RH_HD, RH30_P_vs_RH_C7, "HD", "C7")

ggplot(df, aes(x = logFC.x, y = logFC.y, color = sig)) +
  geom_point() +
  geom_smooth(method = lm, color = "red", fill = "#69b3a2", se = TRUE) +
  ggtitle("RH30 C7 and RH30 HD")
# Chunk 22
# Lists of differentially expressed genes (DEGs) for each comparison.

# Take the per-gene table of a test result, recompute the BH FDR, store the
# gene ID in a column and classify every gene as "Up-regulated",
# "Down-regulated" or "Unchanged" (|logFC| > lfc_cutoff and FDR < fdr_cutoff).
classify_degs <- function(tab, lfc_cutoff = 1, fdr_cutoff = 0.05) {
  tab$FDR <- p.adjust(tab$PValue, method = "BH")
  tab$geneID <- row.names(tab)
  tab %>%
    mutate(
      Expression = case_when(
        logFC > lfc_cutoff & FDR < fdr_cutoff ~ "Up-regulated",
        logFC < -lfc_cutoff & FDR < fdr_cutoff ~ "Down-regulated",
        TRUE ~ "Unchanged"
      )
    )
}

# for RH30_P_vs_RH_HD
data <- classify_degs(RH30_P_vs_RH_HD$table)
DEG_list_RH30_P_vs_RH_HD <- data[data$Expression != "Unchanged", ]
UP_DEG_list_RH30_P_vs_RH_HD <- data[data$Expression == "Up-regulated", ]
DOWN_DEG_list_RH30_P_vs_RH_HD <- data[data$Expression == "Down-regulated", ]
#write.csv(DEG_list_RH30_P_vs_RH_HD, "C:/Users/Christina/Documents/RMS RNA seq/Zoes data/DEG_list_RH30_P_vs_RH_HD.csv")

# for RH30_P_vs_RH_C7
data <- classify_degs(RH30_P_vs_RH_C7$table)
DEG_list_RH30_P_vs_RH_C7 <- data[data$Expression != "Unchanged", ]
UP_DEG_list_RH30_P_vs_RH_C7 <- data[data$Expression == "Up-regulated", ]
DOWN_DEG_list_RH30_P_vs_RH_C7 <- data[data$Expression == "Down-regulated", ]
#write.csv(DEG_list_RH30_P_vs_RH_C7, "C:/Users/Christina/Documents/RMS RNA seq/Zoes data/DEG_list_RH30_P_vs_RH_C7.csv")

# for RH30_P_vs_RH_C13
data <- classify_degs(RH30_P_vs_RH_C13$table)
DEG_list_RH30_P_vs_RH_C13 <- data[data$Expression != "Unchanged", ]
UP_DEG_list_RH30_P_vs_RH_C13 <- data[data$Expression == "Up-regulated", ]
DOWN_DEG_list_RH30_P_vs_RH_C13 <- data[data$Expression == "Down-regulated", ]
#write.csv(DEG_list_RH30_P_vs_RH_C13, "C:/Users/Christina/Documents/RMS RNA seq/Zoes data/DEG_list_RH30_P_vs_RH_C13.csv")
########## find overlapping upregulated DEGS ########################################
# Genes up-regulated in all three comparisons: the per-comparison tables are
# merged successively on geneID, so only shared genes remain.
df_list <- list(UP_DEG_list_RH30_P_vs_RH_C13, UP_DEG_list_RH30_P_vs_RH_C7, UP_DEG_list_RH30_P_vs_RH_HD)
overlapping_DEGS <- df_list %>% reduce(merge, by = "geneID")
print(overlapping_DEGS)
########## find overlapping downregulated DEGS ########################################
# Same for the down-regulated genes. The list previously referenced
# DOWN_DEG_list_RH4ctrl_* / DOWN_DEG_list_RMSYMctrl_* objects that this
# analysis never creates, and the overlap itself was never computed.
df_list <- list(DOWN_DEG_list_RH30_P_vs_RH_C13, DOWN_DEG_list_RH30_P_vs_RH_C7, DOWN_DEG_list_RH30_P_vs_RH_HD)
overlapping_DOWN_DEGS <- df_list %>% reduce(merge, by = "geneID")
print(overlapping_DOWN_DEGS)
# The gene set results below are written relative to this directory.
# NOTE(review): setwd() in a script is fragile; consider building explicit
# output paths with file.path() instead.
setwd("~/RMS RNA seq/Zoes data/GSEA results")
################################ REACTOME #################################
# Competitive gene set testing with CAMERA on the Reactome gene sets, once per
# contrast. For every contrast a bar plot ("Reactome_barplot_<contrast>.png")
# and a table of the significant sets ("Reactome_<contrast>.csv") are written
# to the working directory.
# Check contrasts:
print(colnames(design))
filenames <- c("RH30_P_vs_RH_HD", "RH30_P_vs_RH_C7", "RH30_P_vs_RH_C13")

# The gene sets and their row indices do not depend on the contrast, so they
# are built once rather than on every iteration.
Reactome_pathways <- gmtPathways("C:/Users/Christina/Documents/GSEA .gmt/c2.cp.reactome.v2022.1.Hs.symbols.gmt")
reactome.ind <- ids2indices(Reactome_pathways, rownames(edgeRData$counts))

for (i in filenames) {
  # A fixed inter-gene correlation of 0.05 is used instead of estimating one
  # per gene set.
  gst.camera <- camera.DGEList(
    edgeRData,
    design = design,
    index = reactome.ind,
    contrast = my.contrasts[, i],
    inter.gene.cor = 0.05
  )

  # Number of gene sets at FDR 0.05 (nothing auto-prints inside a loop)
  print(table(gst.camera$FDR < 0.05))

  # Significant gene sets at FDR 0.05
  sig_genesets <- gst.camera[gst.camera$FDR < 0.05, ]
  print(sig_genesets)

  gsea_df <- sig_genesets
  gsea_df$padj <- as.numeric(gsea_df$FDR)
  gsea_df$pathway <- as.character(row.names(gsea_df))
  gsea_df <- gsea_df[order(gsea_df$NGenes), ]

  # Rows are sorted by ascending NGenes, so tail() keeps at most the 20
  # gene sets with the most genes.
  # NOTE(review): the original comment said "20 most significant"; sort by
  # FDR instead if that is what is wanted.
  plot_df <- tail(gsea_df, 20)
  p <- ggplot(plot_df, aes(x = fct_reorder(pathway, -FDR), y = NGenes)) +
    geom_bar(aes(fill = padj), stat = "identity", width = 0.8) +
    scale_fill_continuous(low = "red", high = "blue", na.value = NA, name = "p.adjust") +
    scale_x_discrete(expand = expansion(add = .5)) +
    coord_flip() +
    facet_grid(~Direction) +
    theme_bw() +
    labs(y = "Count", x = "") +
    ggtitle(i) +
    theme(axis.text = element_text(size = 6))
  # Best effort: a contrast without significant sets must not stop the loop
  tryCatch(
    ggsave(
      filename = paste0("Reactome_barplot_", i, ".png"),
      plot = p,
      width = 35, height = 20, units = "cm"
    ),
    error = function(e) {
      message("Reactome bar plot not saved for ", i, ": ", conditionMessage(e))
    }
  )

  ### add gene IDs to the dataframe
  # Genes of each significant set, looked up through the index and stored as
  # one ";"-separated string per set so the table is plain text in the CSV.
  gene_ids <- lapply(
    reactome.ind[gsea_df$pathway],
    function(idx) rownames(edgeRData)[idx]
  )
  gsea_df$gene_ids <- vapply(
    gene_ids, paste, character(1),
    collapse = ";", USE.NAMES = FALSE
  )
  # The set name is already in `pathway`; use plain row numbers in the file
  rownames(gsea_df) <- NULL

  #save results:
  write.csv(gsea_df, file = paste0("Reactome_", i, ".csv"))
}
library(dplyr)
library(readr)
library(tidyverse)
library(RColorBrewer)
library(GSVA)
library(fgsea)
library(edgeR)
library("gplots")
library("ggrepel")
library("ggplot2")
library('knitr')
################################ REACTOME #################################
# Competitive gene set testing with CAMERA on the Reactome gene sets, once per
# contrast. For every contrast a bar plot ("Reactome_barplot_<contrast>.png")
# and a table of the significant sets ("Reactome_<contrast>.csv") are written
# to the working directory.
# NOTE(review): this repeats the Reactome section that precedes the library()
# calls and writes to the same file names; one of the two can probably go.
# Check contrasts:
print(colnames(design))
filenames <- c("RH30_P_vs_RH_HD", "RH30_P_vs_RH_C7", "RH30_P_vs_RH_C13")

# The gene sets and their row indices do not depend on the contrast, so they
# are built once rather than on every iteration.
Reactome_pathways <- gmtPathways("C:/Users/Christina/Documents/GSEA .gmt/c2.cp.reactome.v2022.1.Hs.symbols.gmt")
reactome.ind <- ids2indices(Reactome_pathways, rownames(edgeRData$counts))

for (i in filenames) {
  # A fixed inter-gene correlation of 0.05 is used instead of estimating one
  # per gene set.
  gst.camera <- camera.DGEList(
    edgeRData,
    design = design,
    index = reactome.ind,
    contrast = my.contrasts[, i],
    inter.gene.cor = 0.05
  )

  # Number of gene sets at FDR 0.05 (nothing auto-prints inside a loop)
  print(table(gst.camera$FDR < 0.05))

  # Significant gene sets at FDR 0.05
  sig_genesets <- gst.camera[gst.camera$FDR < 0.05, ]
  print(sig_genesets)

  gsea_df <- sig_genesets
  gsea_df$padj <- as.numeric(gsea_df$FDR)
  gsea_df$pathway <- as.character(row.names(gsea_df))
  gsea_df <- gsea_df[order(gsea_df$NGenes), ]

  # Rows are sorted by ascending NGenes, so tail() keeps at most the 20
  # gene sets with the most genes.
  # NOTE(review): the original comment said "20 most significant"; sort by
  # FDR instead if that is what is wanted.
  plot_df <- tail(gsea_df, 20)
  p <- ggplot(plot_df, aes(x = fct_reorder(pathway, -FDR), y = NGenes)) +
    geom_bar(aes(fill = padj), stat = "identity", width = 0.8) +
    scale_fill_continuous(low = "red", high = "blue", na.value = NA, name = "p.adjust") +
    scale_x_discrete(expand = expansion(add = .5)) +
    coord_flip() +
    facet_grid(~Direction) +
    theme_bw() +
    labs(y = "Count", x = "") +
    ggtitle(i) +
    theme(axis.text = element_text(size = 6))
  # Best effort: a contrast without significant sets must not stop the loop
  tryCatch(
    ggsave(
      filename = paste0("Reactome_barplot_", i, ".png"),
      plot = p,
      width = 35, height = 20, units = "cm"
    ),
    error = function(e) {
      message("Reactome bar plot not saved for ", i, ": ", conditionMessage(e))
    }
  )

  ### add gene IDs to the dataframe
  # Genes of each significant set, looked up through the index and stored as
  # one ";"-separated string per set so the table is plain text in the CSV.
  gene_ids <- lapply(
    reactome.ind[gsea_df$pathway],
    function(idx) rownames(edgeRData)[idx]
  )
  gsea_df$gene_ids <- vapply(
    gene_ids, paste, character(1),
    collapse = ";", USE.NAMES = FALSE
  )
  # The set name is already in `pathway`; use plain row numbers in the file
  rownames(gsea_df) <- NULL

  #save results:
  write.csv(gsea_df, file = paste0("Reactome_", i, ".csv"))
}
# Inspect the table of the last contrast; View() needs an interactive session
print(gsea_df)
if (interactive()) {
  View(gsea_df)
}
#################### HALLMARKS ###############################
# Competitive gene set testing with CAMERA on the Hallmark gene sets, once per
# contrast, using the design matrix from the GLM. For every contrast a bar
# plot ("Hallmarks_barplot_<contrast>.png") and a table of the significant
# sets ("Hallmarks_<contrast>.csv") are written to the working directory.
# Check contrasts:
print(colnames(design))
# One entry per contrast (the list previously named RH30_P_vs_RH_HD twice and
# left out RH30_P_vs_RH_C7)
filenames <- c("RH30_P_vs_RH_HD", "RH30_P_vs_RH_C7", "RH30_P_vs_RH_C13")

# The gene sets and their row indices do not depend on the contrast, so they
# are built once rather than on every iteration.
Hallmarks_pathways <- gmtPathways("C:/Users/Christina/Documents/GSEA .gmt/h.all.v2022.1.Hs.symbols.gmt")
hallmarks.ind <- ids2indices(Hallmarks_pathways, rownames(edgeRData$counts))

for (i in filenames) {
  # A fixed inter-gene correlation of 0.05 is used instead of estimating one
  # per gene set.
  gst.camera <- camera.DGEList(
    edgeRData,
    design = design,
    index = hallmarks.ind,
    contrast = my.contrasts[, i],
    inter.gene.cor = 0.05
  )

  # Number of gene sets at FDR 0.05 (nothing auto-prints inside a loop)
  print(table(gst.camera$FDR < 0.05))

  # Significant gene sets at FDR 0.05
  sig_genesets <- gst.camera[gst.camera$FDR < 0.05, ]
  print(sig_genesets)

  gsea_df <- sig_genesets
  gsea_df$padj <- as.numeric(gsea_df$FDR)
  gsea_df$pathway <- as.character(row.names(gsea_df))

  # Bar plot of all significant sets; `padj` is referenced as a column of the
  # plot data rather than through gsea_df$padj so it stays aligned with the
  # facets.
  p <- ggplot(gsea_df, aes(x = fct_reorder(pathway, -FDR), y = NGenes)) +
    geom_bar(aes(fill = padj), stat = "identity", width = 0.8) +
    scale_fill_continuous(low = "red", high = "blue", na.value = NA, name = "p.adjust") +
    scale_x_discrete(expand = expansion(add = .5)) +
    coord_flip() +
    facet_grid(~Direction) +
    theme_bw() +
    labs(y = "Count", x = "") +
    ggtitle(i) +
    theme(axis.text = element_text(size = 12))
  # Best effort: a contrast without significant sets must not stop the loop
  tryCatch(
    ggsave(
      filename = paste0("Hallmarks_barplot_", i, ".png"),
      plot = p,
      width = 30, height = 20, units = "cm"
    ),
    error = function(e) {
      message("Hallmarks bar plot not saved for ", i, ": ", conditionMessage(e))
    }
  )

  ### add gene IDs to the dataframe
  # Genes of each significant set, looked up through the index and stored as
  # one ";"-separated string per set so the table is plain text in the CSV.
  gene_ids <- lapply(
    hallmarks.ind[gsea_df$pathway],
    function(idx) rownames(edgeRData)[idx]
  )
  gsea_df$gene_ids <- vapply(
    gene_ids, paste, character(1),
    collapse = ";", USE.NAMES = FALSE
  )
  # The set name is already in `pathway`; use plain row numbers in the file
  rownames(gsea_df) <- NULL

  #save results:
  write.csv(gsea_df, file = paste0("Hallmarks_", i, ".csv"))
}
##########for GO ##################################
# Competitive gene set testing with CAMERA on the GO gene sets, once per
# contrast. For every contrast a bar plot ("GO_barplot_<contrast>.png") and a
# table of the significant sets ("GO_<contrast>.csv") are written to the
# working directory.
filenames <- c("RH30_P_vs_RH_HD", "RH30_P_vs_RH_C7", "RH30_P_vs_RH_C13")

# The gene sets and their row indices do not depend on the contrast, so they
# are built once rather than on every iteration.
GO_pathways <- gmtPathways("C:/Users/Christina/Documents/GSEA .gmt/c5.go.v2022.1.Hs.symbols.gmt")
GO.ind <- ids2indices(GO_pathways, rownames(edgeRData$counts))

for (i in filenames) {
  # A fixed inter-gene correlation of 0.05 is used instead of estimating one
  # per gene set.
  gst.camera <- camera.DGEList(
    edgeRData,
    design = design,
    index = GO.ind,
    contrast = my.contrasts[, i],
    inter.gene.cor = 0.05
  )

  # Number of gene sets at FDR 0.05 (nothing auto-prints inside a loop)
  print(table(gst.camera$FDR < 0.05))

  # Significant gene sets at FDR 0.05
  sig_genesets <- gst.camera[gst.camera$FDR < 0.05, ]
  print(sig_genesets)

  gsea_df <- sig_genesets
  gsea_df$padj <- as.numeric(gsea_df$FDR)
  gsea_df$pathway <- as.character(row.names(gsea_df))
  gsea_df <- gsea_df[order(gsea_df$NGenes), ]

  # Rows are sorted by ascending NGenes, so tail() keeps at most the 20
  # gene sets with the most genes.
  # NOTE(review): the original comment said "20 most significant"; sort by
  # FDR instead if that is what is wanted.
  plot_df <- tail(gsea_df, 20)
  p <- ggplot(plot_df, aes(x = fct_reorder(pathway, -FDR), y = NGenes)) +
    geom_bar(aes(fill = padj), stat = "identity", width = 0.8) +
    scale_fill_continuous(low = "red", high = "blue", na.value = NA, name = "p.adjust") +
    scale_x_discrete(expand = expansion(add = .5)) +
    coord_flip() +
    facet_grid(~Direction) +
    theme_bw() +
    labs(y = "Count", x = "") +
    ggtitle(i) +
    theme(axis.text = element_text(size = 6))
  # Best effort: a contrast without significant sets must not stop the loop
  tryCatch(
    ggsave(
      filename = paste0("GO_barplot_", i, ".png"),
      plot = p,
      width = 35, height = 20, units = "cm"
    ),
    error = function(e) {
      message("GO bar plot not saved for ", i, ": ", conditionMessage(e))
    }
  )

  ### add gene IDs to the dataframe
  # Genes of each significant set, looked up through the index and stored as
  # one ";"-separated string per set so the table is plain text in the CSV.
  gene_ids <- lapply(
    GO.ind[gsea_df$pathway],
    function(idx) rownames(edgeRData)[idx]
  )
  gsea_df$gene_ids <- vapply(
    gene_ids, paste, character(1),
    collapse = ";", USE.NAMES = FALSE
  )
  # The set name is already in `pathway`; use plain row numbers in the file
  rownames(gsea_df) <- NULL

  #save results:
  write.csv(gsea_df, file = paste0("GO_", i, ".csv"))
}
